import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
df1=pd.read_csv('credit_card.csv') ## Independent data (all features are input features )
df1
| Ind_ID | GENDER | Car_Owner | Propert_Owner | CHILDREN | Annual_income | Type_Income | EDUCATION | Marital_status | Housing_type | Birthday_count | Employed_days | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Type_Occupation | Family_Members | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | M | Y | Y | 0 | 180000.0 | Pensioner | Higher education | Married | House / apartment | -18772.0 | 365243 | 1 | 0 | 0 | 0 | NaN | 2 |
| 1 | 5009744 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586 | 1 | 1 | 1 | 0 | NaN | 2 |
| 2 | 5009746 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | NaN | -586 | 1 | 1 | 1 | 0 | NaN | 2 |
| 3 | 5009749 | F | Y | N | 0 | NaN | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586 | 1 | 1 | 1 | 0 | NaN | 2 |
| 4 | 5009752 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586 | 1 | 1 | 1 | 0 | NaN | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1543 | 5028645 | F | N | Y | 0 | NaN | Commercial associate | Higher education | Married | House / apartment | -11957.0 | -2182 | 1 | 0 | 0 | 0 | Managers | 2 |
| 1544 | 5023655 | F | N | N | 0 | 225000.0 | Commercial associate | Incomplete higher | Single / not married | House / apartment | -10229.0 | -1209 | 1 | 0 | 0 | 0 | Accountants | 1 |
| 1545 | 5115992 | M | Y | Y | 2 | 180000.0 | Working | Higher education | Married | House / apartment | -13174.0 | -2477 | 1 | 0 | 0 | 0 | Managers | 4 |
| 1546 | 5118219 | M | Y | N | 0 | 270000.0 | Working | Secondary / secondary special | Civil marriage | House / apartment | -15292.0 | -645 | 1 | 1 | 1 | 0 | Drivers | 2 |
| 1547 | 5053790 | F | Y | Y | 0 | 225000.0 | Working | Higher education | Married | House / apartment | -16601.0 | -2859 | 1 | 0 | 0 | 0 | NaN | 2 |
1548 rows × 18 columns
df2=pd.read_csv('credit_card_label.csv')## Dependent data (Y - column )
df2
| Ind_ID | label | |
|---|---|---|
| 0 | 5008827 | 1 |
| 1 | 5009744 | 1 |
| 2 | 5009746 | 1 |
| 3 | 5009749 | 1 |
| 4 | 5009752 | 1 |
| ... | ... | ... |
| 1543 | 5028645 | 0 |
| 1544 | 5023655 | 0 |
| 1545 | 5115992 | 0 |
| 1546 | 5118219 | 0 |
| 1547 | 5053790 | 0 |
1548 rows × 2 columns
## have to merge the input and output data
data = pd.merge(df1,df2,on='Ind_ID')
data
| Ind_ID | GENDER | Car_Owner | Propert_Owner | CHILDREN | Annual_income | Type_Income | EDUCATION | Marital_status | Housing_type | Birthday_count | Employed_days | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Type_Occupation | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | M | Y | Y | 0 | 180000.0 | Pensioner | Higher education | Married | House / apartment | -18772.0 | 365243 | 1 | 0 | 0 | 0 | NaN | 2 | 1 |
| 1 | 5009744 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586 | 1 | 1 | 1 | 0 | NaN | 2 | 1 |
| 2 | 5009746 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | NaN | -586 | 1 | 1 | 1 | 0 | NaN | 2 | 1 |
| 3 | 5009749 | F | Y | N | 0 | NaN | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586 | 1 | 1 | 1 | 0 | NaN | 2 | 1 |
| 4 | 5009752 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586 | 1 | 1 | 1 | 0 | NaN | 2 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1543 | 5028645 | F | N | Y | 0 | NaN | Commercial associate | Higher education | Married | House / apartment | -11957.0 | -2182 | 1 | 0 | 0 | 0 | Managers | 2 | 0 |
| 1544 | 5023655 | F | N | N | 0 | 225000.0 | Commercial associate | Incomplete higher | Single / not married | House / apartment | -10229.0 | -1209 | 1 | 0 | 0 | 0 | Accountants | 1 | 0 |
| 1545 | 5115992 | M | Y | Y | 2 | 180000.0 | Working | Higher education | Married | House / apartment | -13174.0 | -2477 | 1 | 0 | 0 | 0 | Managers | 4 | 0 |
| 1546 | 5118219 | M | Y | N | 0 | 270000.0 | Working | Secondary / secondary special | Civil marriage | House / apartment | -15292.0 | -645 | 1 | 1 | 1 | 0 | Drivers | 2 | 0 |
| 1547 | 5053790 | F | Y | Y | 0 | 225000.0 | Working | Higher education | Married | House / apartment | -16601.0 | -2859 | 1 | 0 | 0 | 0 | NaN | 2 | 0 |
1548 rows × 19 columns
data.head() ### check the top 5 rows of our dataframe
| Ind_ID | GENDER | Car_Owner | Propert_Owner | CHILDREN | Annual_income | Type_Income | EDUCATION | Marital_status | Housing_type | Birthday_count | Employed_days | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Type_Occupation | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | M | Y | Y | 0 | 180000.0 | Pensioner | Higher education | Married | House / apartment | -18772.0 | 365243 | 1 | 0 | 0 | 0 | NaN | 2 | 1 |
| 1 | 5009744 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586 | 1 | 1 | 1 | 0 | NaN | 2 | 1 |
| 2 | 5009746 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | NaN | -586 | 1 | 1 | 1 | 0 | NaN | 2 | 1 |
| 3 | 5009749 | F | Y | N | 0 | NaN | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586 | 1 | 1 | 1 | 0 | NaN | 2 | 1 |
| 4 | 5009752 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586 | 1 | 1 | 1 | 0 | NaN | 2 | 1 |
data.shape ## check the shape of data
(1548, 19)
data.describe() ## to have a statistical summary about the data
| Ind_ID | CHILDREN | Annual_income | Birthday_count | Employed_days | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.548000e+03 | 1548.000000 | 1.525000e+03 | 1526.000000 | 1548.000000 | 1548.0 | 1548.000000 | 1548.000000 | 1548.000000 | 1548.000000 | 1548.000000 |
| mean | 5.078920e+06 | 0.412791 | 1.913993e+05 | -16040.342071 | 59364.689922 | 1.0 | 0.208010 | 0.309432 | 0.092377 | 2.161499 | 0.113049 |
| std | 4.171759e+04 | 0.776691 | 1.132530e+05 | 4229.503202 | 137808.062701 | 0.0 | 0.406015 | 0.462409 | 0.289651 | 0.947772 | 0.316755 |
| min | 5.008827e+06 | 0.000000 | 3.375000e+04 | -24946.000000 | -14887.000000 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| 25% | 5.045070e+06 | 0.000000 | 1.215000e+05 | -19553.000000 | -3174.500000 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 |
| 50% | 5.078842e+06 | 0.000000 | 1.665000e+05 | -15661.500000 | -1565.000000 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 |
| 75% | 5.115673e+06 | 1.000000 | 2.250000e+05 | -12417.000000 | -431.750000 | 1.0 | 0.000000 | 1.000000 | 0.000000 | 3.000000 | 0.000000 |
| max | 5.150412e+06 | 14.000000 | 1.575000e+06 | -7705.000000 | 365243.000000 | 1.0 | 1.000000 | 1.000000 | 1.000000 | 15.000000 | 1.000000 |
data.tail() ### check the last 5 rows of our dataframe
| Ind_ID | GENDER | Car_Owner | Propert_Owner | CHILDREN | Annual_income | Type_Income | EDUCATION | Marital_status | Housing_type | Birthday_count | Employed_days | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Type_Occupation | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1543 | 5028645 | F | N | Y | 0 | NaN | Commercial associate | Higher education | Married | House / apartment | -11957.0 | -2182 | 1 | 0 | 0 | 0 | Managers | 2 | 0 |
| 1544 | 5023655 | F | N | N | 0 | 225000.0 | Commercial associate | Incomplete higher | Single / not married | House / apartment | -10229.0 | -1209 | 1 | 0 | 0 | 0 | Accountants | 1 | 0 |
| 1545 | 5115992 | M | Y | Y | 2 | 180000.0 | Working | Higher education | Married | House / apartment | -13174.0 | -2477 | 1 | 0 | 0 | 0 | Managers | 4 | 0 |
| 1546 | 5118219 | M | Y | N | 0 | 270000.0 | Working | Secondary / secondary special | Civil marriage | House / apartment | -15292.0 | -645 | 1 | 1 | 1 | 0 | Drivers | 2 | 0 |
| 1547 | 5053790 | F | Y | Y | 0 | 225000.0 | Working | Higher education | Married | House / apartment | -16601.0 | -2859 | 1 | 0 | 0 | 0 | NaN | 2 | 0 |
data[data['Employed_days']==365243]
| Ind_ID | GENDER | Car_Owner | Propert_Owner | CHILDREN | Annual_income | Type_Income | EDUCATION | Marital_status | Housing_type | Birthday_count | Employed_days | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Type_Occupation | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | M | Y | Y | 0 | 180000.0 | Pensioner | Higher education | Married | House / apartment | -18772.0 | 365243 | 1 | 0 | 0 | 0 | NaN | 2 | 1 |
| 7 | 5009894 | F | N | N | 0 | 180000.0 | Pensioner | Secondary / secondary special | Married | House / apartment | -22134.0 | 365243 | 1 | 0 | 0 | 0 | NaN | 2 | 1 |
| 26 | 5024916 | F | N | Y | 0 | 112500.0 | Pensioner | Secondary / secondary special | Married | House / apartment | -21832.0 | 365243 | 1 | 0 | 1 | 0 | NaN | 2 | 1 |
| 27 | 5024917 | F | N | Y | 0 | NaN | Pensioner | Secondary / secondary special | Married | House / apartment | -21832.0 | 365243 | 1 | 0 | 1 | 0 | NaN | 2 | 1 |
| 31 | 5029311 | F | N | Y | 0 | 112500.0 | Pensioner | Secondary / secondary special | Married | House / apartment | -21876.0 | 365243 | 1 | 0 | 1 | 1 | NaN | 2 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1509 | 5024077 | F | Y | Y | 0 | 157500.0 | Pensioner | Secondary / secondary special | Widow | House / apartment | -20432.0 | 365243 | 1 | 0 | 0 | 0 | NaN | 1 | 0 |
| 1511 | 5053535 | F | N | Y | 0 | 216000.0 | Pensioner | Higher education | Single / not married | House / apartment | -20388.0 | 365243 | 1 | 0 | 0 | 0 | NaN | 1 | 0 |
| 1514 | 5051097 | F | N | N | 0 | 69750.0 | Pensioner | Secondary / secondary special | Widow | Municipal apartment | -21986.0 | 365243 | 1 | 0 | 1 | 0 | NaN | 1 | 0 |
| 1525 | 5023719 | F | Y | Y | 0 | 175500.0 | Pensioner | Higher education | Married | House / apartment | -22077.0 | 365243 | 1 | 0 | 1 | 0 | NaN | 2 | 0 |
| 1531 | 5048642 | F | N | N | 0 | 157500.0 | Pensioner | Secondary / secondary special | Separated | House / apartment | -24048.0 | 365243 | 1 | 0 | 0 | 0 | NaN | 1 | 0 |
261 rows × 19 columns
data['Employed_days'].replace(365243,np.nan,inplace=True) ## replace with Nan where this value present
data
| Ind_ID | GENDER | Car_Owner | Propert_Owner | CHILDREN | Annual_income | Type_Income | EDUCATION | Marital_status | Housing_type | Birthday_count | Employed_days | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Type_Occupation | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | M | Y | Y | 0 | 180000.0 | Pensioner | Higher education | Married | House / apartment | -18772.0 | NaN | 1 | 0 | 0 | 0 | NaN | 2 | 1 |
| 1 | 5009744 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586.0 | 1 | 1 | 1 | 0 | NaN | 2 | 1 |
| 2 | 5009746 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | NaN | -586.0 | 1 | 1 | 1 | 0 | NaN | 2 | 1 |
| 3 | 5009749 | F | Y | N | 0 | NaN | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586.0 | 1 | 1 | 1 | 0 | NaN | 2 | 1 |
| 4 | 5009752 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586.0 | 1 | 1 | 1 | 0 | NaN | 2 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1543 | 5028645 | F | N | Y | 0 | NaN | Commercial associate | Higher education | Married | House / apartment | -11957.0 | -2182.0 | 1 | 0 | 0 | 0 | Managers | 2 | 0 |
| 1544 | 5023655 | F | N | N | 0 | 225000.0 | Commercial associate | Incomplete higher | Single / not married | House / apartment | -10229.0 | -1209.0 | 1 | 0 | 0 | 0 | Accountants | 1 | 0 |
| 1545 | 5115992 | M | Y | Y | 2 | 180000.0 | Working | Higher education | Married | House / apartment | -13174.0 | -2477.0 | 1 | 0 | 0 | 0 | Managers | 4 | 0 |
| 1546 | 5118219 | M | Y | N | 0 | 270000.0 | Working | Secondary / secondary special | Civil marriage | House / apartment | -15292.0 | -645.0 | 1 | 1 | 1 | 0 | Drivers | 2 | 0 |
| 1547 | 5053790 | F | Y | Y | 0 | 225000.0 | Working | Higher education | Married | House / apartment | -16601.0 | -2859.0 | 1 | 0 | 0 | 0 | NaN | 2 | 0 |
1548 rows × 19 columns
data.isnull().sum()/len(data)*100 ## to check and count the percentage of missing values in each column
Ind_ID 0.000000 GENDER 0.452196 Car_Owner 0.000000 Propert_Owner 0.000000 CHILDREN 0.000000 Annual_income 1.485788 Type_Income 0.000000 EDUCATION 0.000000 Marital_status 0.000000 Housing_type 0.000000 Birthday_count 1.421189 Employed_days 16.860465 Mobile_phone 0.000000 Work_Phone 0.000000 Phone 0.000000 EMAIL_ID 0.000000 Type_Occupation 31.524548 Family_Members 0.000000 label 0.000000 dtype: float64
import missingno as msno ## importing library to handle missing values
msno.bar(data)
<Axes: >
data['Type_Occupation'].value_counts() ## to see the value counts in occupation column
Laborers 268 Core staff 174 Managers 136 Sales staff 122 Drivers 86 High skill tech staff 65 Medicine staff 50 Accountants 44 Security staff 25 Cleaning staff 22 Cooking staff 21 Private service staff 17 Secretaries 9 Low-skill Laborers 9 Waiters/barmen staff 5 HR staff 3 IT staff 2 Realty agents 2 Name: Type_Occupation, dtype: int64
data.info() ## to check the datatype of columns
<class 'pandas.core.frame.DataFrame'> Int64Index: 1548 entries, 0 to 1547 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Ind_ID 1548 non-null int64 1 GENDER 1541 non-null object 2 Car_Owner 1548 non-null object 3 Propert_Owner 1548 non-null object 4 CHILDREN 1548 non-null int64 5 Annual_income 1525 non-null float64 6 Type_Income 1548 non-null object 7 EDUCATION 1548 non-null object 8 Marital_status 1548 non-null object 9 Housing_type 1548 non-null object 10 Birthday_count 1526 non-null float64 11 Employed_days 1287 non-null float64 12 Mobile_phone 1548 non-null int64 13 Work_Phone 1548 non-null int64 14 Phone 1548 non-null int64 15 EMAIL_ID 1548 non-null int64 16 Type_Occupation 1060 non-null object 17 Family_Members 1548 non-null int64 18 label 1548 non-null int64 dtypes: float64(3), int64(8), object(8) memory usage: 241.9+ KB
data['Type_Occupation']=data['Type_Occupation'].fillna(data['Type_Occupation'].mode()[0])
data.isnull().sum()
Ind_ID 0 GENDER 7 Car_Owner 0 Propert_Owner 0 CHILDREN 0 Annual_income 23 Type_Income 0 EDUCATION 0 Marital_status 0 Housing_type 0 Birthday_count 22 Employed_days 261 Mobile_phone 0 Work_Phone 0 Phone 0 EMAIL_ID 0 Type_Occupation 0 Family_Members 0 label 0 dtype: int64
data['Employed_days'].median() ## find median of this column
-1953.0
## used box plot to locate outliers in Employed_days column
import seaborn as sns
sns.boxplot(data['Employed_days'])
<Axes: >
## to check distribution of 'Employed_days' column
data['Employed_days']=data['Employed_days'].fillna(data['Employed_days'].median())
data = data.dropna() ## to drop the rows which have even one missing values
data.isnull().sum() ## we can see that now the data has no missing values
Ind_ID 0 GENDER 0 Car_Owner 0 Propert_Owner 0 CHILDREN 0 Annual_income 0 Type_Income 0 EDUCATION 0 Marital_status 0 Housing_type 0 Birthday_count 0 Employed_days 0 Mobile_phone 0 Work_Phone 0 Phone 0 EMAIL_ID 0 Type_Occupation 0 Family_Members 0 label 0 dtype: int64
data.describe() ## to have a quick view of columns which have outliers
| Ind_ID | CHILDREN | Annual_income | Birthday_count | Employed_days | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.496000e+03 | 1496.000000 | 1.496000e+03 | 1496.000000 | 1496.000000 | 1496.0 | 1496.000000 | 1496.000000 | 1496.000000 | 1496.000000 | 1496.000000 |
| mean | 5.079217e+06 | 0.415775 | 1.907750e+05 | -16036.192513 | -2563.189840 | 1.0 | 0.205882 | 0.304813 | 0.094251 | 2.165107 | 0.106952 |
| std | 4.168109e+04 | 0.780784 | 1.131384e+05 | 4226.506557 | 2218.642455 | 0.0 | 0.404480 | 0.460482 | 0.292276 | 0.951752 | 0.309155 |
| min | 5.008827e+06 | 0.000000 | 3.375000e+04 | -24946.000000 | -14887.000000 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| 25% | 5.045349e+06 | 0.000000 | 1.210500e+05 | -19543.000000 | -3229.250000 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 |
| 50% | 5.079010e+06 | 0.000000 | 1.660500e+05 | -15686.000000 | -1953.000000 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 |
| 75% | 5.115801e+06 | 1.000000 | 2.250000e+05 | -12417.000000 | -1169.500000 | 1.0 | 0.000000 | 1.000000 | 0.000000 | 3.000000 | 0.000000 |
| max | 5.150412e+06 | 14.000000 | 1.575000e+06 | -7705.000000 | -73.000000 | 1.0 | 1.000000 | 1.000000 | 1.000000 | 15.000000 | 1.000000 |
sns.boxplot(data['Employed_days']) ## we can see that this column have a lot of outliers
<Axes: >
Q1 = data['Employed_days'].quantile(0.25)
Q3 = data['Employed_days'].quantile(0.75)
Q1,Q3
(-3229.25, -1169.5)
IQR = Q3-Q1
IQR
2059.75
low_lim = Q1 - 1.5 * IQR
high_lim = Q3 + 1.5 * IQR
data = data[(data['Employed_days'] > low_lim) & (data['Employed_days'] < high_lim)] ## select the rows greater than lower fence and
## less than higher fence.
data
| Ind_ID | GENDER | Car_Owner | Propert_Owner | CHILDREN | Annual_income | Type_Income | EDUCATION | Marital_status | Housing_type | Birthday_count | Employed_days | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Type_Occupation | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | M | Y | Y | 0 | 180000.0 | Pensioner | Higher education | Married | House / apartment | -18772.0 | -1953.0 | 1 | 0 | 0 | 0 | Laborers | 2 | 1 |
| 1 | 5009744 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586.0 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 4 | 5009752 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586.0 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 6 | 5009754 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586.0 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 7 | 5009894 | F | N | N | 0 | 180000.0 | Pensioner | Secondary / secondary special | Married | House / apartment | -22134.0 | -1953.0 | 1 | 0 | 0 | 0 | Laborers | 2 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1542 | 5118268 | M | Y | N | 1 | 360000.0 | State servant | Secondary / secondary special | Married | House / apartment | -11294.0 | -3536.0 | 1 | 0 | 1 | 0 | Drivers | 3 | 0 |
| 1544 | 5023655 | F | N | N | 0 | 225000.0 | Commercial associate | Incomplete higher | Single / not married | House / apartment | -10229.0 | -1209.0 | 1 | 0 | 0 | 0 | Accountants | 1 | 0 |
| 1545 | 5115992 | M | Y | Y | 2 | 180000.0 | Working | Higher education | Married | House / apartment | -13174.0 | -2477.0 | 1 | 0 | 0 | 0 | Managers | 4 | 0 |
| 1546 | 5118219 | M | Y | N | 0 | 270000.0 | Working | Secondary / secondary special | Civil marriage | House / apartment | -15292.0 | -645.0 | 1 | 1 | 1 | 0 | Drivers | 2 | 0 |
| 1547 | 5053790 | F | Y | Y | 0 | 225000.0 | Working | Higher education | Married | House / apartment | -16601.0 | -2859.0 | 1 | 0 | 0 | 0 | Laborers | 2 | 0 |
1382 rows × 19 columns
#### Kernel density estimator is used to see the distibution
sns.kdeplot(data['Annual_income'])
<Axes: xlabel='Annual_income', ylabel='Density'>
Q1 = data['Annual_income'].quantile(0.25)
Q3 = data['Annual_income'].quantile(0.75)
Q1,Q3
(117000.0, 225000.0)
IQR = Q3-Q1
IQR
108000.0
low_lim = Q1 - 1.5 * IQR
high_lim = Q3 + 1.5 * IQR
data = data[(data['Annual_income'] > low_lim) & (data['Annual_income'] < high_lim)]
data
| Ind_ID | GENDER | Car_Owner | Propert_Owner | CHILDREN | Annual_income | Type_Income | EDUCATION | Marital_status | Housing_type | Birthday_count | Employed_days | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Type_Occupation | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | M | Y | Y | 0 | 180000.0 | Pensioner | Higher education | Married | House / apartment | -18772.0 | -1953.0 | 1 | 0 | 0 | 0 | Laborers | 2 | 1 |
| 1 | 5009744 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586.0 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 4 | 5009752 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586.0 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 6 | 5009754 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | -13557.0 | -586.0 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 7 | 5009894 | F | N | N | 0 | 180000.0 | Pensioner | Secondary / secondary special | Married | House / apartment | -22134.0 | -1953.0 | 1 | 0 | 0 | 0 | Laborers | 2 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1542 | 5118268 | M | Y | N | 1 | 360000.0 | State servant | Secondary / secondary special | Married | House / apartment | -11294.0 | -3536.0 | 1 | 0 | 1 | 0 | Drivers | 3 | 0 |
| 1544 | 5023655 | F | N | N | 0 | 225000.0 | Commercial associate | Incomplete higher | Single / not married | House / apartment | -10229.0 | -1209.0 | 1 | 0 | 0 | 0 | Accountants | 1 | 0 |
| 1545 | 5115992 | M | Y | Y | 2 | 180000.0 | Working | Higher education | Married | House / apartment | -13174.0 | -2477.0 | 1 | 0 | 0 | 0 | Managers | 4 | 0 |
| 1546 | 5118219 | M | Y | N | 0 | 270000.0 | Working | Secondary / secondary special | Civil marriage | House / apartment | -15292.0 | -645.0 | 1 | 1 | 1 | 0 | Drivers | 2 | 0 |
| 1547 | 5053790 | F | Y | Y | 0 | 225000.0 | Working | Higher education | Married | House / apartment | -16601.0 | -2859.0 | 1 | 0 | 0 | 0 | Laborers | 2 | 0 |
1315 rows × 19 columns
data.describe() ## we can see that now the data has no outliers.
| Ind_ID | CHILDREN | Annual_income | Birthday_count | Employed_days | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.315000e+03 | 1315.000000 | 1315.000000 | 1315.000000 | 1315.000000 | 1315.0 | 1315.000000 | 1315.000000 | 1315.000000 | 1315.000000 | 1315.000000 |
| mean | 5.078908e+06 | 0.403042 | 174604.123574 | -15928.488973 | -2070.008365 | 1.0 | 0.202281 | 0.293536 | 0.088213 | 2.156654 | 0.101901 |
| std | 4.157915e+04 | 0.783383 | 72572.383220 | 4322.200327 | 1343.177560 | 0.0 | 0.401854 | 0.455555 | 0.283712 | 0.951539 | 0.302633 |
| min | 5.008827e+06 | 0.000000 | 33750.000000 | -24946.000000 | -6317.000000 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| 25% | 5.045254e+06 | 0.000000 | 112500.000000 | -19604.000000 | -2653.500000 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 |
| 50% | 5.069292e+06 | 0.000000 | 157500.000000 | -15366.000000 | -1953.000000 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 |
| 75% | 5.115666e+06 | 1.000000 | 225000.000000 | -12262.000000 | -1113.500000 | 1.0 | 0.000000 | 1.000000 | 0.000000 | 3.000000 | 0.000000 |
| max | 5.150412e+06 | 14.000000 | 382500.000000 | -7705.000000 | -73.000000 | 1.0 | 1.000000 | 1.000000 | 1.000000 | 15.000000 | 1.000000 |
sns.kdeplot(data['Annual_income'])
<Axes: xlabel='Annual_income', ylabel='Density'>
data.to_csv(r'C:\Users\papai\Desktop\New folder (2)\cleaned_data.csv', index=False)
sns.countplot(x = 'GENDER', data = data, hue = 'Car_Owner')
plt.show()
data.GENDER.value_counts()
F 838 M 477 Name: GENDER, dtype: int64
# Compare target column with sex column
pd.crosstab(data.label, data.GENDER)
| GENDER | F | M |
|---|---|---|
| label | ||
| 0 | 759 | 422 |
| 1 | 79 | 55 |
sns.countplot(x = 'label', data = data, hue = 'GENDER')
plt.show()
data[['Birthday_count','Employed_days']] = data[['Birthday_count','Employed_days']].div(-365,axis=0)
C:\Users\papai\AppData\Local\Temp\ipykernel_9776\3941418353.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data[['Birthday_count','Employed_days']] = data[['Birthday_count','Employed_days']].div(-365,axis=0)
data.head()
| Ind_ID | GENDER | Car_Owner | Propert_Owner | CHILDREN | Annual_income | Type_Income | EDUCATION | Marital_status | Housing_type | Birthday_count | Employed_days | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Type_Occupation | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | M | Y | Y | 0 | 180000.0 | Pensioner | Higher education | Married | House / apartment | 51.430137 | 5.350685 | 1 | 0 | 0 | 0 | Laborers | 2 | 1 |
| 1 | 5009744 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 4 | 5009752 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 6 | 5009754 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 7 | 5009894 | F | N | N | 0 | 180000.0 | Pensioner | Secondary / secondary special | Married | House / apartment | 60.641096 | 5.350685 | 1 | 0 | 0 | 0 | Laborers | 2 | 1 |
data.rename(columns={'Birthday_count':'Age','Employed_days':'work_exp_years'},inplace=True)
C:\Users\papai\AppData\Local\Temp\ipykernel_9776\72437478.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
data.rename(columns={'Birthday_count':'Age','Employed_days':'work_exp_years'},inplace=True)
data.head()
| Ind_ID | GENDER | Car_Owner | Propert_Owner | CHILDREN | Annual_income | Type_Income | EDUCATION | Marital_status | Housing_type | Age | work_exp_years | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Type_Occupation | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | M | Y | Y | 0 | 180000.0 | Pensioner | Higher education | Married | House / apartment | 51.430137 | 5.350685 | 1 | 0 | 0 | 0 | Laborers | 2 | 1 |
| 1 | 5009744 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 4 | 5009752 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 6 | 5009754 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 7 | 5009894 | F | N | N | 0 | 180000.0 | Pensioner | Secondary / secondary special | Married | House / apartment | 60.641096 | 5.350685 | 1 | 0 | 0 | 0 | Laborers | 2 | 1 |
sns.lmplot(x = 'Age', y = 'Annual_income', data = data,hue='GENDER',col='GENDER')
plt.show()
data['Age'].corr(data['Annual_income'])
-0.11426994953460573
sns.displot(data['Age'],kde=True)
<seaborn.axisgrid.FacetGrid at 0x270878a1990>
sns.countplot(y = 'EDUCATION', data = data, hue = 'GENDER')
plt.show()
sns.relplot(x = 'work_exp_years', y = 'Annual_income', data = data, kind = 'scatter', hue = 'GENDER', col = 'GENDER')
plt.show()
data['work_exp_years'].corr(data['Annual_income'])
0.09005602936710526
sns.boxplot(x = 'Type_Income', y = 'Annual_income', data = data)
plt.show()
# Let's see how many positive (0) and negative (1) samples we have in our dataframe
data['label'].value_counts()
0 1181 1 134 Name: label, dtype: int64
# Normalized value counts
data.label.value_counts(normalize=True)
0 0.898099 1 0.101901 Name: label, dtype: float64
data.label.value_counts().plot(kind="bar", color=[ 'lightblue','k'])
plt.title('Credit Approval Transaction')
plt.xlabel('Class')
plt.ylabel('Frequency');
data.nunique() ## to see the unique values of column
Ind_ID 1315 GENDER 2 Car_Owner 2 Propert_Owner 2 CHILDREN 5 Annual_income 91 Type_Income 4 EDUCATION 5 Marital_status 5 Housing_type 6 Age 1112 work_exp_years 801 Mobile_phone 1 Work_Phone 2 Phone 2 EMAIL_ID 2 Type_Occupation 18 Family_Members 6 label 2 dtype: int64
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1315 entries, 0 to 1547 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Ind_ID 1315 non-null int64 1 GENDER 1315 non-null object 2 Car_Owner 1315 non-null object 3 Propert_Owner 1315 non-null object 4 CHILDREN 1315 non-null int64 5 Annual_income 1315 non-null float64 6 Type_Income 1315 non-null object 7 EDUCATION 1315 non-null object 8 Marital_status 1315 non-null object 9 Housing_type 1315 non-null object 10 Age 1315 non-null float64 11 work_exp_years 1315 non-null float64 12 Mobile_phone 1315 non-null int64 13 Work_Phone 1315 non-null int64 14 Phone 1315 non-null int64 15 EMAIL_ID 1315 non-null int64 16 Type_Occupation 1315 non-null object 17 Family_Members 1315 non-null int64 18 label 1315 non-null int64 dtypes: float64(3), int64(8), object(8) memory usage: 237.8+ KB
df = data.copy() ## make a copy of original data
df
| Ind_ID | GENDER | Car_Owner | Propert_Owner | CHILDREN | Annual_income | Type_Income | EDUCATION | Marital_status | Housing_type | Age | work_exp_years | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Type_Occupation | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | M | Y | Y | 0 | 180000.0 | Pensioner | Higher education | Married | House / apartment | 51.430137 | 5.350685 | 1 | 0 | 0 | 0 | Laborers | 2 | 1 |
| 1 | 5009744 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 4 | 5009752 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 6 | 5009754 | F | Y | N | 0 | 315000.0 | Commercial associate | Higher education | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 7 | 5009894 | F | N | N | 0 | 180000.0 | Pensioner | Secondary / secondary special | Married | House / apartment | 60.641096 | 5.350685 | 1 | 0 | 0 | 0 | Laborers | 2 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1542 | 5118268 | M | Y | N | 1 | 360000.0 | State servant | Secondary / secondary special | Married | House / apartment | 30.942466 | 9.687671 | 1 | 0 | 1 | 0 | Drivers | 3 | 0 |
| 1544 | 5023655 | F | N | N | 0 | 225000.0 | Commercial associate | Incomplete higher | Single / not married | House / apartment | 28.024658 | 3.312329 | 1 | 0 | 0 | 0 | Accountants | 1 | 0 |
| 1545 | 5115992 | M | Y | Y | 2 | 180000.0 | Working | Higher education | Married | House / apartment | 36.093151 | 6.786301 | 1 | 0 | 0 | 0 | Managers | 4 | 0 |
| 1546 | 5118219 | M | Y | N | 0 | 270000.0 | Working | Secondary / secondary special | Civil marriage | House / apartment | 41.895890 | 1.767123 | 1 | 1 | 1 | 0 | Drivers | 2 | 0 |
| 1547 | 5053790 | F | Y | Y | 0 | 225000.0 | Working | Higher education | Married | House / apartment | 45.482192 | 7.832877 | 1 | 0 | 0 | 0 | Laborers | 2 | 0 |
1315 rows × 19 columns
df['Type_Income'].value_counts()
Working 677 Commercial associate 304 Pensioner 250 State servant 84 Name: Type_Income, dtype: int64
df['Housing_type'].value_counts()
House / apartment 1170 With parents 72 Municipal apartment 45 Rented apartment 19 Office apartment 6 Co-op apartment 3 Name: Housing_type, dtype: int64
df.select_dtypes('object') ## to select the categorical columns
| GENDER | Car_Owner | Propert_Owner | Type_Income | EDUCATION | Marital_status | Housing_type | Type_Occupation | |
|---|---|---|---|---|---|---|---|---|
| 0 | M | Y | Y | Pensioner | Higher education | Married | House / apartment | Laborers |
| 1 | F | Y | N | Commercial associate | Higher education | Married | House / apartment | Laborers |
| 4 | F | Y | N | Commercial associate | Higher education | Married | House / apartment | Laborers |
| 6 | F | Y | N | Commercial associate | Higher education | Married | House / apartment | Laborers |
| 7 | F | N | N | Pensioner | Secondary / secondary special | Married | House / apartment | Laborers |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1542 | M | Y | N | State servant | Secondary / secondary special | Married | House / apartment | Drivers |
| 1544 | F | N | N | Commercial associate | Incomplete higher | Single / not married | House / apartment | Accountants |
| 1545 | M | Y | Y | Working | Higher education | Married | House / apartment | Managers |
| 1546 | M | Y | N | Working | Secondary / secondary special | Civil marriage | House / apartment | Drivers |
| 1547 | F | Y | Y | Working | Higher education | Married | House / apartment | Laborers |
1315 rows × 8 columns
dummy_data = pd.get_dummies(data[['GENDER','Car_Owner','Propert_Owner','Marital_status','Housing_type']],drop_first = True)
dummy_data
| GENDER_M | Car_Owner_Y | Propert_Owner_Y | Marital_status_Married | Marital_status_Separated | Marital_status_Single / not married | Marital_status_Widow | Housing_type_House / apartment | Housing_type_Municipal apartment | Housing_type_Office apartment | Housing_type_Rented apartment | Housing_type_With parents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 6 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 7 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1542 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1544 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1545 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1546 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1547 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1315 rows × 12 columns
from sklearn.preprocessing import OrdinalEncoder
OE = OrdinalEncoder()
OE.fit_transform(df[['Type_Income']])
array([[1.],
[0.],
[0.],
...,
[3.],
[3.],
[3.]])
df['Type_Income']=OE.fit_transform(df[['Type_Income']])
df.head()
| Ind_ID | GENDER | Car_Owner | Propert_Owner | CHILDREN | Annual_income | Type_Income | EDUCATION | Marital_status | Housing_type | Age | work_exp_years | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Type_Occupation | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | M | Y | Y | 0 | 180000.0 | 1.0 | Higher education | Married | House / apartment | 51.430137 | 5.350685 | 1 | 0 | 0 | 0 | Laborers | 2 | 1 |
| 1 | 5009744 | F | Y | N | 0 | 315000.0 | 0.0 | Higher education | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 4 | 5009752 | F | Y | N | 0 | 315000.0 | 0.0 | Higher education | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 6 | 5009754 | F | Y | N | 0 | 315000.0 | 0.0 | Higher education | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | Laborers | 2 | 1 |
| 7 | 5009894 | F | N | N | 0 | 180000.0 | 1.0 | Secondary / secondary special | Married | House / apartment | 60.641096 | 5.350685 | 1 | 0 | 0 | 0 | Laborers | 2 | 1 |
df['EDUCATION']=OE.fit_transform(df[['EDUCATION']])
df['Type_Occupation']=OE.fit_transform(df[['Type_Occupation']])
df.head()
| Ind_ID | GENDER | Car_Owner | Propert_Owner | CHILDREN | Annual_income | Type_Income | EDUCATION | Marital_status | Housing_type | Age | work_exp_years | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Type_Occupation | Family_Members | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | M | Y | Y | 0 | 180000.0 | 1.0 | 1.0 | Married | House / apartment | 51.430137 | 5.350685 | 1 | 0 | 0 | 0 | 8.0 | 2 | 1 |
| 1 | 5009744 | F | Y | N | 0 | 315000.0 | 0.0 | 1.0 | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | 8.0 | 2 | 1 |
| 4 | 5009752 | F | Y | N | 0 | 315000.0 | 0.0 | 1.0 | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | 8.0 | 2 | 1 |
| 6 | 5009754 | F | Y | N | 0 | 315000.0 | 0.0 | 1.0 | Married | House / apartment | 37.142466 | 1.605479 | 1 | 1 | 1 | 0 | 8.0 | 2 | 1 |
| 7 | 5009894 | F | N | N | 0 | 180000.0 | 1.0 | 4.0 | Married | House / apartment | 60.641096 | 5.350685 | 1 | 0 | 0 | 0 | 8.0 | 2 | 1 |
df_fin = pd.concat([df,dummy_data],axis=1)
df_fin.columns ### These are all the columns after encoding
Index(['Ind_ID', 'GENDER', 'Car_Owner', 'Propert_Owner', 'CHILDREN',
'Annual_income', 'Type_Income', 'EDUCATION', 'Marital_status',
'Housing_type', 'Age', 'work_exp_years', 'Mobile_phone', 'Work_Phone',
'Phone', 'EMAIL_ID', 'Type_Occupation', 'Family_Members', 'label',
'GENDER_M', 'Car_Owner_Y', 'Propert_Owner_Y', 'Marital_status_Married',
'Marital_status_Separated', 'Marital_status_Single / not married',
'Marital_status_Widow', 'Housing_type_House / apartment',
'Housing_type_Municipal apartment', 'Housing_type_Office apartment',
'Housing_type_Rented apartment', 'Housing_type_With parents'],
dtype='object')
df_fin.drop(columns=['GENDER','Car_Owner','Propert_Owner','Marital_status','Housing_type'],axis=1,inplace=True)
df_fin
| Ind_ID | CHILDREN | Annual_income | Type_Income | EDUCATION | Age | work_exp_years | Mobile_phone | Work_Phone | Phone | ... | Propert_Owner_Y | Marital_status_Married | Marital_status_Separated | Marital_status_Single / not married | Marital_status_Widow | Housing_type_House / apartment | Housing_type_Municipal apartment | Housing_type_Office apartment | Housing_type_Rented apartment | Housing_type_With parents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | 0 | 180000.0 | 1.0 | 1.0 | 51.430137 | 5.350685 | 1 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1 | 5009744 | 0 | 315000.0 | 0.0 | 1.0 | 37.142466 | 1.605479 | 1 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 5009752 | 0 | 315000.0 | 0.0 | 1.0 | 37.142466 | 1.605479 | 1 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 6 | 5009754 | 0 | 315000.0 | 0.0 | 1.0 | 37.142466 | 1.605479 | 1 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 7 | 5009894 | 0 | 180000.0 | 1.0 | 4.0 | 60.641096 | 5.350685 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1542 | 5118268 | 1 | 360000.0 | 2.0 | 4.0 | 30.942466 | 9.687671 | 1 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1544 | 5023655 | 0 | 225000.0 | 0.0 | 2.0 | 28.024658 | 3.312329 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1545 | 5115992 | 2 | 180000.0 | 3.0 | 1.0 | 36.093151 | 6.786301 | 1 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1546 | 5118219 | 0 | 270000.0 | 3.0 | 4.0 | 41.895890 | 1.767123 | 1 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1547 | 5053790 | 0 | 225000.0 | 3.0 | 1.0 | 45.482192 | 7.832877 | 1 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1315 rows × 26 columns
from sklearn.feature_selection import chi2
df_fin.info() ## we can see that all columns have been converted to numerical
<class 'pandas.core.frame.DataFrame'> Int64Index: 1315 entries, 0 to 1547 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Ind_ID 1315 non-null int64 1 CHILDREN 1315 non-null int64 2 Annual_income 1315 non-null float64 3 Type_Income 1315 non-null float64 4 EDUCATION 1315 non-null float64 5 Age 1315 non-null float64 6 work_exp_years 1315 non-null float64 7 Mobile_phone 1315 non-null int64 8 Work_Phone 1315 non-null int64 9 Phone 1315 non-null int64 10 EMAIL_ID 1315 non-null int64 11 Type_Occupation 1315 non-null float64 12 Family_Members 1315 non-null int64 13 label 1315 non-null int64 14 GENDER_M 1315 non-null uint8 15 Car_Owner_Y 1315 non-null uint8 16 Propert_Owner_Y 1315 non-null uint8 17 Marital_status_Married 1315 non-null uint8 18 Marital_status_Separated 1315 non-null uint8 19 Marital_status_Single / not married 1315 non-null uint8 20 Marital_status_Widow 1315 non-null uint8 21 Housing_type_House / apartment 1315 non-null uint8 22 Housing_type_Municipal apartment 1315 non-null uint8 23 Housing_type_Office apartment 1315 non-null uint8 24 Housing_type_Rented apartment 1315 non-null uint8 25 Housing_type_With parents 1315 non-null uint8 dtypes: float64(6), int64(8), uint8(12) memory usage: 201.8 KB
df_chi2 = df_fin.drop(columns=['Ind_ID','Annual_income','Age','work_exp_years','Family_Members']) ## Select categorical columns
## for chi2 test
df_chi2
| CHILDREN | Type_Income | EDUCATION | Mobile_phone | Work_Phone | Phone | EMAIL_ID | Type_Occupation | label | GENDER_M | ... | Propert_Owner_Y | Marital_status_Married | Marital_status_Separated | Marital_status_Single / not married | Marital_status_Widow | Housing_type_House / apartment | Housing_type_Municipal apartment | Housing_type_Office apartment | Housing_type_Rented apartment | Housing_type_With parents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1.0 | 1.0 | 1 | 0 | 0 | 0 | 8.0 | 1 | 1 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0.0 | 1.0 | 1 | 1 | 1 | 0 | 8.0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0.0 | 1.0 | 1 | 1 | 1 | 0 | 8.0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 6 | 0 | 0.0 | 1.0 | 1 | 1 | 1 | 0 | 8.0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 7 | 0 | 1.0 | 4.0 | 1 | 0 | 0 | 0 | 8.0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1542 | 1 | 2.0 | 4.0 | 1 | 0 | 1 | 0 | 4.0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1544 | 0 | 0.0 | 2.0 | 1 | 0 | 0 | 0 | 0.0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1545 | 2 | 3.0 | 1.0 | 1 | 0 | 0 | 0 | 10.0 | 0 | 1 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1546 | 0 | 3.0 | 4.0 | 1 | 1 | 1 | 0 | 4.0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1547 | 0 | 3.0 | 1.0 | 1 | 0 | 0 | 0 | 8.0 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1315 rows × 21 columns
x = df_chi2.drop('label',axis=1)
y = df['label']
f_p_value = chi2(x,y) ## Returns two value 1. Fscore 2. p value
f_p_value
(array([6.68548364e+00, 1.88530120e+00, 2.82731008e-02, 0.00000000e+00,
3.96218851e-01, 3.14629247e-01, 3.12201947e-01, 6.06530326e-01,
9.36287057e-01, 4.07308765e-01, 2.77324911e-03, 2.93617810e-01,
1.41131075e+00, 2.31987958e+00, 6.67337880e-02, 6.31702346e-01,
9.99086209e+00, 3.51152367e+00, 2.44969147e+00, 1.68984179e+00]),
array([0.00972011, 0.16973285, 0.86646837, 1. , 0.52904849,
0.57485329, 0.57633212, 0.4360972 , 0.33323455, 0.52333864,
0.9580015 , 0.58791143, 0.23483892, 0.12772991, 0.79615313,
0.42673157, 0.00157319, 0.06094338, 0.11754797, 0.19362183]))
feature_scores = pd.DataFrame({'F_score':f_p_value[0],'p_values':f_p_value[1]},index = x.columns)
feature_scores.sort_values(by='p_values', ascending=True)
| F_score | p_values | |
|---|---|---|
| Housing_type_Municipal apartment | 9.990862 | 0.001573 |
| CHILDREN | 6.685484 | 0.009720 |
| Housing_type_Office apartment | 3.511524 | 0.060943 |
| Housing_type_Rented apartment | 2.449691 | 0.117548 |
| Marital_status_Single / not married | 2.319880 | 0.127730 |
| Type_Income | 1.885301 | 0.169733 |
| Housing_type_With parents | 1.689842 | 0.193622 |
| Marital_status_Separated | 1.411311 | 0.234839 |
| GENDER_M | 0.936287 | 0.333235 |
| Housing_type_House / apartment | 0.631702 | 0.426732 |
| Type_Occupation | 0.606530 | 0.436097 |
| Car_Owner_Y | 0.407309 | 0.523339 |
| Work_Phone | 0.396219 | 0.529048 |
| Phone | 0.314629 | 0.574853 |
| EMAIL_ID | 0.312202 | 0.576332 |
| Marital_status_Married | 0.293618 | 0.587911 |
| Marital_status_Widow | 0.066734 | 0.796153 |
| EDUCATION | 0.028273 | 0.866468 |
| Propert_Owner_Y | 0.002773 | 0.958001 |
| Mobile_phone | 0.000000 | 1.000000 |
df_fin.drop(columns=['Mobile_phone','Propert_Owner_Y'],inplace =True)
df_fin
| Ind_ID | CHILDREN | Annual_income | Type_Income | EDUCATION | Age | work_exp_years | Work_Phone | Phone | EMAIL_ID | ... | Car_Owner_Y | Marital_status_Married | Marital_status_Separated | Marital_status_Single / not married | Marital_status_Widow | Housing_type_House / apartment | Housing_type_Municipal apartment | Housing_type_Office apartment | Housing_type_Rented apartment | Housing_type_With parents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | 0 | 180000.0 | 1.0 | 1.0 | 51.430137 | 5.350685 | 0 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1 | 5009744 | 0 | 315000.0 | 0.0 | 1.0 | 37.142466 | 1.605479 | 1 | 1 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 5009752 | 0 | 315000.0 | 0.0 | 1.0 | 37.142466 | 1.605479 | 1 | 1 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 6 | 5009754 | 0 | 315000.0 | 0.0 | 1.0 | 37.142466 | 1.605479 | 1 | 1 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 7 | 5009894 | 0 | 180000.0 | 1.0 | 4.0 | 60.641096 | 5.350685 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1542 | 5118268 | 1 | 360000.0 | 2.0 | 4.0 | 30.942466 | 9.687671 | 0 | 1 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1544 | 5023655 | 0 | 225000.0 | 0.0 | 2.0 | 28.024658 | 3.312329 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1545 | 5115992 | 2 | 180000.0 | 3.0 | 1.0 | 36.093151 | 6.786301 | 0 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1546 | 5118219 | 0 | 270000.0 | 3.0 | 4.0 | 41.895890 | 1.767123 | 1 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1547 | 5053790 | 0 | 225000.0 | 3.0 | 1.0 | 45.482192 | 7.832877 | 0 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1315 rows × 24 columns
num_col = df_fin[['Annual_income','Age','work_exp_years','Family_Members']]
num_col.corr()
| Annual_income | Age | work_exp_years | Family_Members | |
|---|---|---|---|---|
| Annual_income | 1.000000 | -0.114270 | 0.090056 | 0.059224 |
| Age | -0.114270 | 1.000000 | 0.102264 | -0.273596 |
| work_exp_years | 0.090056 | 0.102264 | 1.000000 | 0.053713 |
| Family_Members | 0.059224 | -0.273596 | 0.053713 | 1.000000 |
corr= num_col.corr()
sns.heatmap(corr,annot=True)
plt.show()
classes=pd.value_counts(df_fin['label'],sort=True)
classes.plot(kind='bar',color =['green','red'],rot=0)
plt.title('Credit Approval Transaction')
plt.xlabel('Class')
plt.ylabel('Frequency')
Text(0, 0.5, 'Frequency')
df_fin['label'].value_counts()
0 1181 1 134 Name: label, dtype: int64
Approved = df_fin[df_fin['label']==0]
Rejected = df_fin[df_fin['label']==1]
print(Approved.shape,Rejected.shape)
(1181, 24) (134, 24)
from imblearn.combine import SMOTETomek ## importing necessary libraries
columns = df_fin.columns.tolist()
columns = [c for c in columns if c not in ['label']]
X = df_fin[columns]
X.head() ## Independent Data
| Ind_ID | CHILDREN | Annual_income | Type_Income | EDUCATION | Age | work_exp_years | Work_Phone | Phone | EMAIL_ID | ... | Car_Owner_Y | Marital_status_Married | Marital_status_Separated | Marital_status_Single / not married | Marital_status_Widow | Housing_type_House / apartment | Housing_type_Municipal apartment | Housing_type_Office apartment | Housing_type_Rented apartment | Housing_type_With parents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | 0 | 180000.0 | 1.0 | 1.0 | 51.430137 | 5.350685 | 0 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1 | 5009744 | 0 | 315000.0 | 0.0 | 1.0 | 37.142466 | 1.605479 | 1 | 1 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 5009752 | 0 | 315000.0 | 0.0 | 1.0 | 37.142466 | 1.605479 | 1 | 1 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 6 | 5009754 | 0 | 315000.0 | 0.0 | 1.0 | 37.142466 | 1.605479 | 1 | 1 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 7 | 5009894 | 0 | 180000.0 | 1.0 | 4.0 | 60.641096 | 5.350685 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
5 rows × 23 columns
Y = df_fin['label'] ## Output data
smk = SMOTETomek(random_state=42)
X_res,Y_res=smk.fit_resample(X,Y)
print(X_res.shape,Y_res.shape)
(2156, 23) (2156,)
from collections import Counter
print('Original dataset shape{}'.format(Counter(Y)))
print('Resampled dataset shape{}'.format(Counter(Y_res)))
Original dataset shapeCounter({0: 1181, 1: 134})
Resampled dataset shapeCounter({1: 1078, 0: 1078})
Y_res.value_counts().plot(kind="bar", color=[ 'lightblue','k'])
plt.title('Credit Approval Transaction')
plt.xlabel('Class')
plt.ylabel('Frequency')
Text(0, 0.5, 'Frequency')
df_fin = pd.concat([X_res,Y_res],axis=1) ## to conactenate the input & output data
df_fin
| Ind_ID | CHILDREN | Annual_income | Type_Income | EDUCATION | Age | work_exp_years | Work_Phone | Phone | EMAIL_ID | ... | Marital_status_Married | Marital_status_Separated | Marital_status_Single / not married | Marital_status_Widow | Housing_type_House / apartment | Housing_type_Municipal apartment | Housing_type_Office apartment | Housing_type_Rented apartment | Housing_type_With parents | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | 0 | 180000.000000 | 1.000000 | 1.000000 | 51.430137 | 5.350685 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 1 | 5009744 | 0 | 315000.000000 | 0.000000 | 1.000000 | 37.142466 | 1.605479 | 1 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 2 | 5009752 | 0 | 315000.000000 | 0.000000 | 1.000000 | 37.142466 | 1.605479 | 1 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 3 | 5009754 | 0 | 315000.000000 | 0.000000 | 1.000000 | 37.142466 | 1.605479 | 1 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 4 | 5018498 | 0 | 90000.000000 | 3.000000 | 4.000000 | 51.917808 | 2.745205 | 1 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2151 | 5060807 | 0 | 135000.000000 | 1.326560 | 4.000000 | 32.378086 | 4.985919 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 2152 | 5009019 | 0 | 180000.000000 | 1.000000 | 1.541784 | 53.093586 | 5.350685 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 2153 | 5135345 | 0 | 112500.000000 | 0.000000 | 1.000000 | 55.076712 | 1.520548 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 2154 | 5091513 | 0 | 110701.346232 | 0.679940 | 4.000000 | 52.374028 | 5.483970 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 2155 | 5149837 | 0 | 157500.000000 | 0.753187 | 1.000000 | 33.936986 | 3.630137 | 0 | 1 | 1 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
2156 rows × 24 columns
sns.pairplot(df_fin,hue='label')
<seaborn.axisgrid.PairGrid at 0x270929bf850>
X_res
| Ind_ID | CHILDREN | Annual_income | Type_Income | EDUCATION | Age | work_exp_years | Work_Phone | Phone | EMAIL_ID | ... | Car_Owner_Y | Marital_status_Married | Marital_status_Separated | Marital_status_Single / not married | Marital_status_Widow | Housing_type_House / apartment | Housing_type_Municipal apartment | Housing_type_Office apartment | Housing_type_Rented apartment | Housing_type_With parents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008827 | 0 | 180000.000000 | 1.000000 | 1.000000 | 51.430137 | 5.350685 | 0 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1 | 5009744 | 0 | 315000.000000 | 0.000000 | 1.000000 | 37.142466 | 1.605479 | 1 | 1 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2 | 5009752 | 0 | 315000.000000 | 0.000000 | 1.000000 | 37.142466 | 1.605479 | 1 | 1 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 3 | 5009754 | 0 | 315000.000000 | 0.000000 | 1.000000 | 37.142466 | 1.605479 | 1 | 1 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 5018498 | 0 | 90000.000000 | 3.000000 | 4.000000 | 51.917808 | 2.745205 | 1 | 1 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2151 | 5060807 | 0 | 135000.000000 | 1.326560 | 4.000000 | 32.378086 | 4.985919 | 0 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2152 | 5009019 | 0 | 180000.000000 | 1.000000 | 1.541784 | 53.093586 | 5.350685 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2153 | 5135345 | 0 | 112500.000000 | 0.000000 | 1.000000 | 55.076712 | 1.520548 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2154 | 5091513 | 0 | 110701.346232 | 0.679940 | 4.000000 | 52.374028 | 5.483970 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2155 | 5149837 | 0 | 157500.000000 | 0.753187 | 1.000000 | 33.936986 | 3.630137 | 0 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
2156 rows × 23 columns
Y_res
0 1
1 1
2 1
3 1
4 1
..
2151 1
2152 1
2153 1
2154 1
2155 1
Name: label, Length: 2156, dtype: int64
from sklearn.model_selection import train_test_split ## we take 15 % for test data and 85 % for training
X_train, X_test, Y_train, Y_test = train_test_split(X_res, Y_res, test_size = 0.15, random_state = 5)
print(X_res.shape, X_train.shape ,Y_train.shape, X_test.shape) ## shape of test and train data
(2156, 23) (1832, 23) (1832,) (324, 23)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X_train, Y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
y_pred = log_reg.predict(X_test)
accuracy_score(Y_test, y_pred)
0.8209876543209876
log_reg_score = log_reg.score(X_test,Y_test)
log_reg_score
0.8209876543209876
from sklearn.tree import DecisionTreeClassifier
dec_tree = DecisionTreeClassifier(max_depth=14,max_features='sqrt')
dec_tree.fit(X_train,Y_train)
DecisionTreeClassifier(max_depth=14, max_features='sqrt')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(max_depth=14, max_features='sqrt')
y_pred_dt = dec_tree.predict(X_test)
accuracy_score(Y_test,y_pred_dt)
0.8796296296296297
y_pred_train = dec_tree.predict(X_train)
accuracy_score(Y_train,y_pred_train)
0.986353711790393
## since this model is overfitting so that hyperparameter tuning should be used to avoid that and select best parameters.
dt_score = dec_tree.score(X_test,Y_test)
dt_score
0.8796296296296297
from sklearn.ensemble import RandomForestClassifier
r_f = RandomForestClassifier()
r_f.fit(X_train,Y_train)
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier()
r_f_pred = r_f.predict(X_test)
rf_score = r_f.score(X_test,Y_test)
rf_score
0.9598765432098766
Y_tr_val = r_f.predict(X_train)
accuracy_score(Y_train,Y_tr_val)
1.0
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train,Y_train)
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier()
k_score = knn.score(X_test,Y_test)
k_score
0.8796296296296297
model_scores={'logistic reg':log_reg_score,'Decision Tree':dt_score,'Random Forest':rf_score,'KNN':k_score}
model_compare = pd.DataFrame(model_scores, index=['accuracy'])
model_compare.T.plot.bar();
# Create a list of train scores
train_scores = []
# Create a list of test scores
test_scores = []
# Create a list of different values for n_neighbors
neighbors = range(1, 21) # 1 to 20
# Setup algorithm
knn = KNeighborsClassifier()
# Loop through different neighbors values
for i in neighbors:
knn.set_params(n_neighbors = i) # set neighbors value
# Update the scores
knn.fit(X_train, Y_train)
train_scores.append(knn.score(X_train, Y_train))
test_scores.append(knn.score(X_test, Y_test))
train_scores
[1.0, 0.9814410480349345, 0.9432314410480349, 0.9432314410480349, 0.9055676855895196, 0.9066593886462883, 0.8804585152838428, 0.8804585152838428, 0.8629912663755459, 0.8662663755458515, 0.8526200873362445, 0.8526200873362445, 0.8384279475982532, 0.8367903930131004, 0.8258733624454149, 0.8335152838427947, 0.8236899563318777, 0.8198689956331878, 0.8155021834061136, 0.8144104803493449]
plt.plot(neighbors, train_scores, label="Train score")
plt.plot(neighbors, test_scores, label="Test score")
plt.xticks(np.arange(1, 21, 1))
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend()
<matplotlib.legend.Legend at 0x270b39842d0>
print(f"Maximum KNN score on the test data: {max(test_scores)*100:.2f}%")
Maximum KNN score on the test data: 92.90%
from sklearn.model_selection import RandomizedSearchCV
parameters = {'penalty':['l1','l2','elasticnet'],'C': np.logspace(-4, 4, 20),'solver': ["liblinear"]}
classifier_reg = RandomizedSearchCV(log_reg,param_distributions=parameters,scoring='accuracy',cv=5,verbose=True)
classifier_reg.fit(X_train, Y_train)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
C:\Users\papai\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py:378: FitFailedWarning:
25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\papai\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\papai\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\papai\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 64, in _check_solver
raise ValueError(
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\papai\anaconda3\Lib\site-packages\sklearn\model_selection\_search.py:952: UserWarning: One or more of the test scores are non-finite: [0.77236491 nan nan 0.78654725 nan nan
0.78654725 0.78654725 0.78654725 nan]
warnings.warn(
RandomizedSearchCV(cv=5, estimator=LogisticRegression(),
param_distributions={'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
'penalty': ['l1', 'l2', 'elasticnet'],
'solver': ['liblinear']},
scoring='accuracy', verbose=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomizedSearchCV(cv=5, estimator=LogisticRegression(),
param_distributions={'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
'penalty': ['l1', 'l2', 'elasticnet'],
'solver': ['liblinear']},
scoring='accuracy', verbose=True)LogisticRegression()
LogisticRegression()
classifier_reg.best_params_
{'solver': 'liblinear', 'penalty': 'l2', 'C': 545.5594781168514}
classifier_reg.score(X_test, Y_test)
0.8209876543209876
rf_para = {"n_estimators": np.arange(10, 1000, 50),
"max_depth": [None, 3, 5, 10],
"min_samples_split": np.arange(2, 20, 2),
"min_samples_leaf": np.arange(1, 15, 2)}
rf_class = RandomizedSearchCV(r_f,param_distributions=rf_para,scoring='accuracy',cv=5,verbose=True)
rf_class.fit(X_train, Y_train)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
param_distributions={'max_depth': [None, 3, 5, 10],
'min_samples_leaf': array([ 1, 3, 5, 7, 9, 11, 13]),
'min_samples_split': array([ 2, 4, 6, 8, 10, 12, 14, 16, 18]),
'n_estimators': array([ 10, 60, 110, 160, 210, 260, 310, 360, 410, 460, 510, 560, 610,
660, 710, 760, 810, 860, 910, 960])},
scoring='accuracy', verbose=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
param_distributions={'max_depth': [None, 3, 5, 10],
'min_samples_leaf': array([ 1, 3, 5, 7, 9, 11, 13]),
'min_samples_split': array([ 2, 4, 6, 8, 10, 12, 14, 16, 18]),
'n_estimators': array([ 10, 60, 110, 160, 210, 260, 310, 360, 410, 460, 510, 560, 610,
660, 710, 760, 810, 860, 910, 960])},
scoring='accuracy', verbose=True)RandomForestClassifier()
RandomForestClassifier()
rf_class.best_params_
{'n_estimators': 60,
'min_samples_split': 10,
'min_samples_leaf': 1,
'max_depth': 10}
rf_class.score(X_test, Y_test)
0.9506172839506173
Y_Train_pred = rf_class.predict(X_train)
accuracy_score(Y_train,Y_Train_pred)
0.9759825327510917
r_f_random = RandomForestClassifier(min_samples_split= 10,min_samples_leaf= 1,max_depth=10,n_estimators = 60)
r_f_random.fit(X_train,Y_train)
RandomForestClassifier(max_depth=10, min_samples_split=10, n_estimators=60)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(max_depth=10, min_samples_split=10, n_estimators=60)
rfpredict = r_f_random.predict(X_test)
rfscore = r_f_random.score(X_test,Y_test)
rfscore
0.9567901234567902
y_predict = rf_class.predict(X_test)
y_predict
array([1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1,
0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0], dtype=int64)
Y_test
1907 1
1654 1
708 0
1173 0
1701 1
..
1792 1
1620 1
433 0
824 0
1153 0
Name: label, Length: 324, dtype: int64
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
print(confusion_matrix(Y_test, y_predict))
[[159 8] [ 8 149]]
cm = confusion_matrix(Y_test, y_predict,labels = rf_class.classes_)
cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm ,display_labels=['Approved','Rejected'])
cm_disp.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x270c3c4af50>
print(classification_report(Y_test, y_predict))
precision recall f1-score support
0 0.95 0.95 0.95 167
1 0.95 0.95 0.95 157
accuracy 0.95 324
macro avg 0.95 0.95 0.95 324
weighted avg 0.95 0.95 0.95 324
rf_class.best_params_
{'n_estimators': 60,
'min_samples_split': 10,
'min_samples_leaf': 1,
'max_depth': 10}
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
kfold = KFold(14)
results = cross_val_score(r_f_random,X_res,Y_res,cv=kfold)
print(results)
print(np.mean(results))
[0.66883117 0.94805195 0.96753247 0.92207792 0.92207792 0.95454545 0.92857143 0.96753247 0.96753247 0.93506494 0.94155844 0.95454545 0.97402597 0.96753247] 0.9299628942486086
r_f_random.feature_importances_
array([0.09731907, 0.02843513, 0.0802261 , 0.08616369, 0.07615891,
0.08966794, 0.10902073, 0.02901764, 0.02659929, 0.01244562,
0.07194758, 0.04547586, 0.0185979 , 0.04239487, 0.05520422,
0.01593217, 0.03787102, 0.01814458, 0.03685571, 0.00321747,
0.00051337, 0.00424384, 0.01454729])
# Match features to columns
feature_val = pd.DataFrame({'coef':r_f_random.feature_importances_},index = X_res.columns)
feature_val.sort_values(by = 'coef',ascending = False)
| coef | |
|---|---|
| work_exp_years | 0.109021 |
| Ind_ID | 0.097319 |
| Age | 0.089668 |
| Type_Income | 0.086164 |
| Annual_income | 0.080226 |
| EDUCATION | 0.076159 |
| Type_Occupation | 0.071948 |
| Marital_status_Married | 0.055204 |
| Family_Members | 0.045476 |
| Car_Owner_Y | 0.042395 |
| Marital_status_Single / not married | 0.037871 |
| Housing_type_House / apartment | 0.036856 |
| Work_Phone | 0.029018 |
| CHILDREN | 0.028435 |
| Phone | 0.026599 |
| GENDER_M | 0.018598 |
| Marital_status_Widow | 0.018145 |
| Marital_status_Separated | 0.015932 |
| Housing_type_With parents | 0.014547 |
| EMAIL_ID | 0.012446 |
| Housing_type_Rented apartment | 0.004244 |
| Housing_type_Municipal apartment | 0.003217 |
| Housing_type_Office apartment | 0.000513 |
## query : select count(*),Type_income,avg(Annual_income) from crdit_data group by Type_income
## query : select * from crdit_data where gender like 'F' and Car_owner ='Y' and Propert_owner like 'Y'
## Query : select * from crdit_data where gender = 'M' and Family_members >0
## query: select Ind_id, Annual_income from crdit_data order by Annual_income desc limit 5
# Findings : The highest salary of top 5 customers are : #### Ind_ID Income #### 5117323 382500 #### 5066817 360000 #### 5066597 360000 #### 5146463 360000 #### 5044935 360000
## query: select Ind_ID , Marital_status,Label from crdit_data where Marital_status='married' and label =1
## query: select education ,count(*)total_count from crdit_data group by education order by total_count desc limit 1
## Query : with
## cust_1 as(select * from crdit_data where marital_status in ('married','civil marriage')
## and gender like 'm' or gender like'f' and label=1),
## cust_MF as (select gender , count(*) total_count
## from cust_1 group by gender)
## select * from cust_MF